package getOfdText;

import com.jfinal.server.undertow.PropExt.FileNotFoundException;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.NoSuchFileException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipFile;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import org.xml.sax.SAXException;

/**
 * Extracts the plain text of an OFD document so that it can be fed to a full-text index.
 *
 * <p>An OFD file is read as a zip archive: it is unpacked, {@code OFD.xml} is followed to the
 * document root and its page list, and the {@code TextCode} contents of every page are
 * concatenated.
 *
 * <p>Note: {@code FileNotFoundException} in this class is the type imported at the top of the
 * file ({@code com.jfinal.server.undertow.PropExt.FileNotFoundException}), not
 * {@code java.io.FileNotFoundException}. It is thrown from methods without a {@code throws}
 * clause, i.e. it is used as an unchecked exception.
 *
 * @author 李德才
 * @version V2.0
 * @date 2021/7/29 10:15
 */

public class GetOfdText {

  /** Sample document used by {@link #main(String[])} when no path is given on the command line. */
  private static final String OFD_PATH = "C:\\Users\\86159\\Desktop\\resoult\\OFD\\10页倾斜字体.ofd";

  /** Matches runs of whitespace; compiled once because it is applied to every page. */
  private static final Pattern WHITESPACE = Pattern.compile("\\s+");

  /**
   * Command-line entry point: prints the extraction time in milliseconds and the extracted text
   * to standard error.
   *
   * @param args optional; {@code args[0]} is the OFD file to read, otherwise the built-in sample
   *     path is used
   */
  public static void main(String[] args) {
    String path = args.length > 0 ? args[0] : OFD_PATH;
    try {
      long start = System.currentTimeMillis();
      String ofdText = getOfdText(path);
      long end = System.currentTimeMillis();
      System.err.println(end - start);
      System.err.println(ofdText);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Returns all text contained in an OFD file, with whitespace removed.
   *
   * <p>The archive is unpacked into a fresh temporary directory that is always removed before
   * this method returns. Nothing is written next to the input file, so existing siblings of the
   * input (for example a {@code .zip} or a folder with the same base name) are never touched.
   *
   * @param filePath path of the OFD file
   * @return the concatenated text of all pages; empty if the package structure cannot be read
   * @throws IOException if {@code filePath} is not an existing regular file or the temporary
   *     directory cannot be created
   * @throws RuntimeException if the file cannot be unpacked as a zip archive
   */
  public static String getOfdText(String filePath) throws IOException {
    File source = new File(filePath);
    if (!source.isFile()) {
      throw new NoSuchFileException(filePath);
    }
    // ZipFile does not care about the file extension, so the .ofd can be opened directly
    // instead of first being copied to a sibling ".zip".
    File workDir = Files.createTempDirectory("ofd-text-").toFile();
    try {
      unZip(source, workDir.getPath());
      return combinationText(workDir).toString();
    } finally {
      deleteFile(workDir);
    }
  }


  /**
   * Deletes a file, or a directory together with everything below it.
   *
   * @param dirFile file or directory to delete
   * @return {@code true} if {@code dirFile} itself was deleted; {@code false} if it did not exist
   *     or could not be deleted
   */
  public static boolean deleteFile(File dirFile) {
    if (!dirFile.exists()) {
      return false;
    }
    if (dirFile.isDirectory()) {
      // listFiles() returns null when the directory cannot be read.
      File[] children = dirFile.listFiles();
      if (children != null) {
        for (File child : children) {
          deleteFile(child);
        }
      }
    }
    return dirFile.delete();
  }

  /**
   * Copies an OFD file to a sibling file whose name ends in {@code .zip}.
   *
   * <p>Only a trailing {@code .ofd} (any letter case) is replaced; if the name has no such
   * suffix, {@code .zip} is appended. The target therefore never equals the source. An existing
   * target file is overwritten.
   *
   * @param file the file to copy
   * @return the newly written copy
   * @throws IOException if the source cannot be read or the target cannot be written
   */
  public static File copyAndRenameFile(File file) throws IOException {
    File zipFile = new File(stripSuffixIgnoreCase(file.getPath(), ".ofd") + ".zip");
    // Open the source first so that a missing source does not leave an empty target behind.
    try (InputStream in = Files.newInputStream(file.toPath());
        FileOutputStream out = new FileOutputStream(zipFile)) {
      pump(in, out);
    }
    return zipFile;
  }


  /**
   * Unpacks a zip file into a sibling directory named after the file without its trailing
   * {@code .zip}, and returns that directory's path.
   *
   * @param file the zip file
   * @return path of the directory the archive was unpacked into
   * @throws ZipException if {@link #unZip(File, String)} reports failure
   * @throws RuntimeException if unpacking fails, see {@link #unZip(File, String)}
   */
  public static String unzip(File file) throws ZipException {
    String path = stripSuffixIgnoreCase(file.getPath(), ".zip");
    boolean unzipFile = unZip(file, path);
    if (!unzipFile) {
      throw new ZipException("zip 解压失败");
    }
    return path;
  }

  /**
   * Unpacks every entry of a zip archive below {@code destDirPath}.
   *
   * <p>Directories that already exist and files that already exist are accepted (files are
   * overwritten). Entries whose name would resolve outside {@code destDirPath} are rejected.
   *
   * @param srcFile the archive to unpack
   * @param destDirPath directory to unpack into; created on demand
   * @return always {@code true}; every failure is reported by exception
   * @throws RuntimeException if {@code srcFile} does not exist, or wrapping any error raised
   *     while unpacking
   */
  public static boolean unZip(File srcFile, String destDirPath) throws RuntimeException {
    if (!srcFile.exists()) {
      throw new RuntimeException(srcFile.getPath() + "所指文件不存在");
    }
    try (ZipFile zipFile = new ZipFile(srcFile)) {
      File destDir = new File(destDirPath);
      String destRoot = destDir.getCanonicalPath();
      String destPrefix = destRoot.endsWith(File.separator) ? destRoot : destRoot + File.separator;

      Enumeration<? extends ZipEntry> entries = zipFile.entries();
      while (entries.hasMoreElements()) {
        ZipEntry entry = entries.nextElement();
        File target = new File(destDir, entry.getName());

        // Guard against entry names such as "../../x" that would write outside destDir.
        String targetPath = target.getCanonicalPath();
        if (!targetPath.equals(destRoot) && !targetPath.startsWith(destPrefix)) {
          throw new ZipException("zip entry escapes target directory: " + entry.getName());
        }

        if (entry.isDirectory()) {
          ensureDirectory(target);
        } else {
          // The parent directory may not have its own entry in the archive.
          ensureDirectory(target.getParentFile());
          try (InputStream in = zipFile.getInputStream(entry);
              FileOutputStream out = new FileOutputStream(target)) {
            pump(in, out);
          }
        }
      }
    } catch (Exception e) {
      throw new RuntimeException("unzip error from ZipUtils", e);
    }
    return true;
  }


  /**
   * Concatenates the text of all page content files of an unpacked OFD package.
   *
   * <p>Extraction is best effort: a failure is printed to standard error and the text gathered
   * from the remaining pages is still returned.
   *
   * @param file root directory of the unpacked OFD package
   * @return the collected text; empty if the package structure cannot be read
   */
  public static StringBuilder combinationText(File file) {
    StringBuilder builder = new StringBuilder();
    List<File> xmlList;
    try {
      xmlList = getXmlFileList(file);
    } catch (Exception e) {
      e.printStackTrace();
      return builder;
    }
    for (File xmlFile : xmlList) {
      try {
        builder.append(getXmlText(xmlFile));
      } catch (Exception e) {
        // One unreadable page must not discard the text of the pages after it.
        e.printStackTrace();
      }
    }
    return builder;
  }


  /**
   * Returns the page content XML files of an unpacked OFD package.
   *
   * <p>The locations are not assumed from a fixed folder layout; they are read from the
   * package's own description files, see {@link #getContentXml(File)}.
   *
   * @param file root directory of the unpacked OFD package
   * @return the page content files, in the order listed by the document
   * @throws FileNotFoundException if {@code file} is not a non-empty directory
   */
  public static List<File> getXmlFileList(File file) {
    List<File> xmlFile = new ArrayList<>();
    if (checkFolderPath(file)) {
      for (String contentXml : getContentXml(file)) {
        xmlFile.add(new File(contentXml));
      }
    }
    return xmlFile;
  }


  /**
   * Resolves the paths of the page content XML files from the package's description files.
   *
   * <p>Reads {@code OFD.xml} in the package root, follows {@code DocBody/DocRoot} to the document
   * file, and collects the {@code BaseLoc} attribute of every child of that document's
   * {@code Pages} element. Missing elements or attributes are skipped rather than failing.
   *
   * @param file root directory of the unpacked OFD package
   * @return paths of the page content files; empty if a description file cannot be parsed (the
   *     parse error is printed to standard error)
   */
  public static List<String> getContentXml(File file) {
    List<String> contentXmlList = new ArrayList<>();
    SAXReader reader = newSecureReader();
    try {
      // Start from the package's self-describing entry file.
      Document ofdXml = reader.read(new File(file.getPath() + File.separator + "OFD.xml"));
      Element docBody = ofdXml.getRootElement().element("DocBody");
      Element docRoot = docBody == null ? null : docBody.element("DocRoot");
      if (docRoot == null) {
        return contentXmlList;
      }

      File documentXml = new File(resolveLoc(file, file, docRoot.getTextTrim()));
      Element pages = reader.read(documentXml).getRootElement().element("Pages");
      if (pages == null) {
        return contentXmlList;
      }

      File docFolder = documentXml.getParentFile();
      List<Element> elements = pages.elements();
      for (Element element : elements) {
        String baseLoc = element.attributeValue("BaseLoc");
        if (baseLoc != null) {
          contentXmlList.add(resolveLoc(file, docFolder, baseLoc));
        }
      }
    } catch (DocumentException e) {
      e.printStackTrace();
    }
    return contentXmlList;
  }

  /**
   * Verifies that {@code file} is an existing, non-empty directory.
   *
   * @param file the directory to check
   * @return always {@code true}; every violation is reported by exception
   * @throws FileNotFoundException if {@code file} does not exist, is not a directory, or has no
   *     (readable) entries
   */
  public static boolean checkFolderPath(File file) {
    if (!file.exists()) {
      throw new FileNotFoundException("文件夹不存在");
    }
    if (!file.isDirectory()) {
      throw new FileNotFoundException("所指的文件对象不是文件夹");
    }
    // listFiles() returns null when the directory cannot be read; treat that like "empty".
    File[] children = file.listFiles();
    if (children == null || children.length == 0) {
      throw new FileNotFoundException("文件夹下没有找到电子文件");
    }
    return true;
  }

  /**
   * Returns the text of one page content XML file with all whitespace removed.
   *
   * <p>Only {@code TextCode} children of {@code TextObject} elements are collected; every other
   * element carries no searchable text and is ignored. All {@code Layer} children of
   * {@code Content} are read, and {@code PageBlock} elements are searched recursively.
   * NOTE(review): the {@code Layer}/{@code PageBlock} nesting follows the OFD page schema
   * (GB/T 33190-2016) - confirm against the documents this is run on.
   *
   * @param file a page content XML file
   * @return the page text; empty if the page has no {@code Content} element
   * @throws DocumentException if the file cannot be parsed
   */
  public static String getXmlText(File file) throws DocumentException {
    Document document = newSecureReader().read(file);
    Element content = document.getRootElement().element("Content");
    StringBuilder builder = new StringBuilder();
    if (content != null) {
      List<Element> layers = content.elements("Layer");
      for (Element layer : layers) {
        appendText(layer, builder);
      }
    }
    return WHITESPACE.matcher(builder).replaceAll("");
  }

  /** Appends the text of all TextObject elements below {@code container} to {@code out}. */
  private static void appendText(Element container, StringBuilder out) {
    List<Element> children = container.elements();
    for (Element child : children) {
      String name = child.getName();
      if ("TextObject".equals(name)) {
        List<Element> codes = child.elements("TextCode");
        for (Element code : codes) {
          out.append(code.getTextTrim());
        }
      } else if ("PageBlock".equals(name)) {
        appendText(child, out);
      }
    }
  }

  /**
   * Resolves a location string from an OFD description file to a file system path. A location
   * starting with "/" is taken relative to the package root, anything else relative to
   * {@code baseDir}.
   */
  private static String resolveLoc(File packageRoot, File baseDir, String loc) {
    if (loc.startsWith("/")) {
      return packageRoot.getPath() + File.separator + loc.substring(1);
    }
    return baseDir.getPath() + File.separator + loc;
  }

  /** Creates a SAXReader that rejects DOCTYPE declarations and external entities. */
  private static SAXReader newSecureReader() {
    SAXReader reader = new SAXReader();
    try {
      reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
      reader.setFeature("http://xml.org/sax/features/external-general-entities", false);
      reader.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
    } catch (SAXException e) {
      throw new IllegalStateException("XML parser cannot be configured securely", e);
    }
    return reader;
  }

  /** Makes sure {@code dir} exists as a directory, creating it and its parents if necessary. */
  private static void ensureDirectory(File dir) {
    // mkdirs() also returns false when the directory already exists, hence the re-check.
    if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
      throw new FileNotFoundException("文件夹创建失败");
    }
  }

  /** Copies all remaining bytes of {@code in} to {@code out}; closes neither stream. */
  private static void pump(InputStream in, FileOutputStream out) throws IOException {
    byte[] buf = new byte[8192];
    int len;
    while ((len = in.read(buf)) != -1) {
      out.write(buf, 0, len);
    }
  }

  /** Removes {@code suffix} from the end of {@code path}, ignoring letter case, if present. */
  private static String stripSuffixIgnoreCase(String path, String suffix) {
    int cut = path.length() - suffix.length();
    if (cut >= 0 && path.regionMatches(true, cut, suffix, 0, suffix.length())) {
      return path.substring(0, cut);
    }
    return path;
  }


}
